import numpy as np
import pandas as pd
import statsmodels.api as sm
import math
import sys

## Linear Estimator
def regEstimate(dataset, pbvarb = 'predicted_prob_black', outcome = 'audited'):
        model = sm.OLS.from_formula(outcome+' ~ '+pbvarb, data = dataset).fit(cov_type = 'HC1')
        coef = model.params[pbvarb]
        se = model.bse[pbvarb]
        black_aud_rate = model.params[0] + model.params[1]
        non_black_aud_rate = model.params[0] 
        print("number of obs :" + str(model.nobs))
        return coef, se, black_aud_rate, non_black_aud_rate

## Linear Estimator with controls
def regEstimate_control(dataset, pbvarb = 'predicted_prob_black', controls = '', outcome = 'audited'):
        model = sm.OLS.from_formula(outcome+' ~ '+pbvarb+' + '+controls, data = dataset).fit(cov_type = 'HC1')
        coef = model.params[pbvarb]
        se = model.bse[pbvarb]
        results = model.summary()
        print("number of obs :"+ str(model.nobs))
        return coef, se

## Probabilistic Estimator 
def chenEstimate(dataset, pbvarb = 'predicted_prob_black', outcome = 'audited'):
        black_aud_rate = (dataset[pbvarb]*dataset[outcome]).sum()/(dataset[pbvarb]).sum()
        non_black_aud_rate = ((1-dataset[pbvarb])*dataset[outcome]).sum()/(1-dataset[pbvarb]).sum()
        est =  black_aud_rate - non_black_aud_rate
        return est, black_aud_rate, non_black_aud_rate

## Get standard errors
def getSEs(dataset,pbvarb='predicted_prob_black',outcome='audited', seReg = 0.0):
        seMultiplier=getSEMultiplier(dataset,pbvarb)
        #seReg = regEstimate(dataset,pbvarb,outcome)[1]
        seChen = seReg*seMultiplier
        return seChen, seReg

def getSEMultiplier(dataset,pbvarb='predicted_prob_black'):
        return np.sqrt(dataset[pbvarb].var()/(dataset[pbvarb].mean()*(1-dataset[pbvarb].mean())))


### Operational audits and predicted race probabilitiess
dataBISG = pd.read_csv("/REDACTED/data/final/individualBISG2014_full_final.csv")
dataBISG= dataBISG[~dataBISG.predicted_prob_black.isna()]
extract = pd.read_stata('/REDACTED/HertzGraff/cam_eic_extract.dta')

extract = extract.rename(columns={"taxpayer_id":"taxpayer_id_new", "taxpayer_id_typ":"taxpayer_id_typ_new", "cycle_posted":"cycle_posted_new", "eic":"eic_new", "eitc_amt_computer":"eitc_amt_computer_new"})
dataBISG = dataBISG.merge(extract, on='taxpayer_id_new', how='left')
dataBISG['isEIC_new'] = np.where(dataBISG['eic_new'] > 0, 1, 0)

##########################################################
#### Table 1
##########################################################
dataBISG.predicted_prob_black.mean()


# write results out to a txt file
sys.stdout = open("/REDACTED/aud_disp_time_type_update.txt", "w")

eitc_pop = dataBISG.loc[dataBISG['isEIC_new'] == 1]
non_eitc_pop = dataBISG.loc[dataBISG['isEIC_new'] == 0]

## Start with Overall Population
print("Full population: \n")

# probabilistic estimates
probDisp_any = chenEstimate(dataBISG, pbvarb = 'predicted_prob_black', outcome = 'aud_no_research_audits')
probDisp_pre = chenEstimate(dataBISG, pbvarb = 'predicted_prob_black', outcome = 'pre_refund')
probDisp_post = chenEstimate(dataBISG, pbvarb = 'predicted_prob_black', outcome = 'post_ref')
probDisp_corr = chenEstimate(dataBISG, pbvarb = 'predicted_prob_black', outcome = 'corr_aud')
probDisp_ncorr = chenEstimate(dataBISG, pbvarb = 'predicted_prob_black', outcome = 'non_corr_aud')

# linear estimates
linDisp_any = regEstimate(dataBISG, pbvarb = 'predicted_prob_black', outcome = 'aud_no_research_audits')
linDisp_pre = regEstimate(dataBISG, pbvarb = 'predicted_prob_black', outcome = 'pre_refund')
linDisp_post = regEstimate(dataBISG, pbvarb = 'predicted_prob_black', outcome = 'post_ref')
linDisp_corr = regEstimate(dataBISG, pbvarb = 'predicted_prob_black', outcome = 'corr_aud')
linDisp_ncorr = regEstimate(dataBISG, pbvarb = 'predicted_prob_black', outcome = 'non_corr_aud')

# standard errors
any_SE = getSEs(dataBISG,pbvarb='predicted_prob_black',outcome='aud_no_research_audits', seReg = float(linDisp_any[1]))
pre_SE = getSEs(dataBISG,pbvarb='predicted_prob_black',outcome='pre_refund', seReg = float(linDisp_pre[1]))
post_SE = getSEs(dataBISG,pbvarb='predicted_prob_black',outcome='post_ref', seReg = float(linDisp_post[1]))
corr_SE = getSEs(dataBISG,pbvarb='predicted_prob_black',outcome='corr_aud', seReg = float(linDisp_corr[1]))
ncorr_SE = getSEs(dataBISG,pbvarb='predicted_prob_black',outcome='non_corr_aud', seReg = float(linDisp_ncorr[1]))

# overall audit rates
overall_ar = dataBISG.aud_no_research_audits.mean()
pre_ar = dataBISG.pre_refund.mean()
post_ar = dataBISG.post_ref.mean()
corr_ar = dataBISG.corr_aud.mean()
ncorr_ar = dataBISG.non_corr_aud.mean()

# print results
print("Prob Disp:")
print("any:  " + str(round(100*probDisp_any[0], 3)))
print("pre:  " + str(round(100*probDisp_pre[0], 3)))
print("post:  " + str(round(100*probDisp_post[0], 3)))
print("corr:  " + str(round(100*probDisp_corr[0], 3)))
print("non_corr:  " + str(round(100*probDisp_ncorr[0], 3)))

print("Lin Disp:")
print("any:  " + str(round(100*linDisp_any[0], 3)))
print("pre:  " + str(round(100*linDisp_pre[0], 3)))
print("post:  " + str(round(100*linDisp_post[0], 3)))
print("corr:  " + str(round(100*linDisp_corr[0], 3)))
print("non_corr:  " + str(round(100*linDisp_ncorr[0], 3)))

print("SE [prob, lin]:")
print("any:  " + str(tuple(round(100*x, 3) for x in any_SE)))
print("pre:  " + str(tuple(round(100*x, 3) for x in pre_SE)))
print("post:  " + str(tuple(round(100*x, 3) for x in post_SE)))
print("corr:  " + str(tuple(round(100*x, 3) for x in corr_SE)))
print("non_corr:  " + str(tuple(round(100*x, 3) for x in ncorr_SE)))

print("Mean Audit Rates:")
print("any: " + str(round(100*overall_ar, 3)))
print("pre: " + str(round(100*pre_ar, 3)))
print("post: " + str(round(100*post_ar, 3)))
print("corr: " + str(round(100*corr_ar, 3)))
print("non_corr: " + str(round(100*ncorr_ar, 3)))



## EITC population
print("\nEITC population: \n")


# probabilistic estimates
eic_probDisp_any = chenEstimate(eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'aud_no_research_audits')
eic_probDisp_pre = chenEstimate(eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'pre_refund')
eic_probDisp_post = chenEstimate(eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'post_ref')
eic_probDisp_corr = chenEstimate(eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'corr_aud')
eic_probDisp_ncorr = chenEstimate(eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'non_corr_aud')

# linear estimates
eic_linDisp_any = regEstimate(eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'aud_no_research_audits')
eic_linDisp_pre = regEstimate(eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'pre_refund')
eic_linDisp_post = regEstimate(eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'post_ref')
eic_linDisp_corr = regEstimate(eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'corr_aud')
eic_linDisp_ncorr = regEstimate(eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'non_corr_aud')

# standard errors
eic_any_SE = getSEs(eitc_pop,pbvarb='predicted_prob_black',outcome='aud_no_research_audits', seReg = float(eic_linDisp_any[1]))
eic_pre_SE = getSEs(eitc_pop,pbvarb='predicted_prob_black',outcome='pre_refund', seReg = float(eic_linDisp_pre[1]))
eic_post_SE = getSEs(eitc_pop,pbvarb='predicted_prob_black',outcome='post_ref', seReg = float(eic_linDisp_post[1]))
eic_corr_SE = getSEs(eitc_pop,pbvarb='predicted_prob_black',outcome='corr_aud', seReg = float(eic_linDisp_corr[1]))
eic_ncorr_SE = getSEs(eitc_pop,pbvarb='predicted_prob_black',outcome='non_corr_aud', seReg = float(eic_linDisp_ncorr[1]))

# overall audit rates
eic_overall_ar = eitc_pop.aud_no_research_audits.mean()
eic_pre_ar = eitc_pop.pre_refund.mean()
eic_post_ar = eitc_pop.post_ref.mean()
eic_corr_ar = eitc_pop.corr_aud.mean()
eic_ncorr_ar = eitc_pop.non_corr_aud.mean()

# print results
print("Prob Disp:")
print("any:  " + str(round(100*eic_probDisp_any[0], 3)))
print("pre:  " + str(round(100*eic_probDisp_pre[0], 3)))
print("post:  " + str(round(100*eic_probDisp_post[0], 3)))
print("corr:  " + str(round(100*eic_probDisp_corr[0], 3)))
print("non_corr:  " + str(round(100*eic_probDisp_ncorr[0], 3)))

print("Lin Disp:")
print("any:  " + str(round(100*eic_linDisp_any[0], 3)))
print("pre:  " + str(round(100*eic_linDisp_pre[0], 3)))
print("post:  " + str(round(100*eic_linDisp_post[0], 3)))
print("corr:  " + str(round(100*eic_linDisp_corr[0], 3)))
print("non_corr:  " + str(round(100*eic_linDisp_ncorr[0], 3)))

print("SE [prob, lin]:")
print("any:  " + str(tuple(round(100*x, 3) for x in eic_any_SE)))
print("pre:  " + str(tuple(round(100*x, 3) for x in eic_pre_SE)))
print("post:  " + str(tuple(round(100*x, 3) for x in eic_post_SE)))
print("corr:  " + str(tuple(round(100*x, 3) for x in eic_corr_SE)))
print("non_corr:  " + str(tuple(round(100*x, 3) for x in eic_ncorr_SE)))

print("Mean Audit Rates:")
print("any: " + str(round(100*eic_overall_ar, 3)))
print("pre: " + str(round(100*eic_pre_ar, 3)))
print("post: " + str(round(100*eic_post_ar, 3)))
print("corr: " + str(round(100*eic_corr_ar, 3)))
print("non_corr: " + str(round(100*eic_ncorr_ar, 3)))

## Non EITC population
print("\nNon EITC population: \n")

# probabilistic estimates
non_eic_probDisp_any = chenEstimate(non_eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'aud_no_research_audits')
non_eic_probDisp_pre = chenEstimate(non_eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'pre_refund')
non_eic_probDisp_post = chenEstimate(non_eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'post_ref')
non_eic_probDisp_corr = chenEstimate(non_eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'corr_aud')
non_eic_probDisp_ncorr = chenEstimate(non_eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'non_corr_aud')

# linear estimates
non_eic_linDisp_any = regEstimate(non_eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'aud_no_research_audits')
non_eic_linDisp_pre = regEstimate(non_eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'pre_refund')
non_eic_linDisp_post = regEstimate(non_eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'post_ref')
non_eic_linDisp_corr = regEstimate(non_eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'corr_aud')
non_eic_linDisp_ncorr = regEstimate(non_eitc_pop, pbvarb = 'predicted_prob_black', outcome = 'non_corr_aud')

# standard errors
non_eic_any_SE = getSEs(non_eitc_pop,pbvarb='predicted_prob_black',outcome='aud_no_research_audits', seReg = float(non_eic_linDisp_any[1]))
non_eic_pre_SE = getSEs(non_eitc_pop,pbvarb='predicted_prob_black',outcome='pre_refund', seReg = float(non_eic_linDisp_pre[1]))
non_eic_post_SE = getSEs(non_eitc_pop,pbvarb='predicted_prob_black',outcome='post_ref', seReg = float(non_eic_linDisp_post[1]))
non_eic_corr_SE = getSEs(non_eitc_pop,pbvarb='predicted_prob_black',outcome='corr_aud', seReg = float(non_eic_linDisp_corr[1]))
non_eic_ncorr_SE = getSEs(non_eitc_pop,pbvarb='predicted_prob_black',outcome='non_corr_aud', seReg = float(non_eic_linDisp_ncorr[1]))

# overall audit rates
non_eic_overall_ar = non_eitc_pop.aud_no_research_audits.mean()
non_eic_pre_ar = non_eitc_pop.pre_refund.mean()
non_eic_post_ar = non_eitc_pop.post_ref.mean()
non_eic_corr_ar = non_eitc_pop.corr_aud.mean()
non_eic_ncorr_ar = non_eitc_pop.non_corr_aud.mean()

# print results
print("Prob Disp:")
print("any:  " + str(round(100*non_eic_probDisp_any[0], 3)))
print("pre:  " + str(round(100*non_eic_probDisp_pre[0], 3)))
print("post:  " + str(round(100*non_eic_probDisp_post[0], 3)))
print("corr:  " + str(round(100*non_eic_probDisp_corr[0], 3)))
print("non_corr:  " + str(round(100*non_eic_probDisp_ncorr[0], 3)))

print("Lin Disp:")
print("any:  " + str(round(100*non_eic_linDisp_any[0], 3)))
print("pre:  " + str(round(100*non_eic_linDisp_pre[0], 3)))
print("post:  " + str(round(100*non_eic_linDisp_post[0], 3)))
print("corr:  " + str(round(100*non_eic_linDisp_corr[0], 3)))
print("non_corr:  " + str(round(100*non_eic_linDisp_ncorr[0], 3)))

print("SE [prob, lin]:")
print("any:  " + str(tuple(round(100*x, 3) for x in non_eic_any_SE)))
print("pre:  " + str(tuple(round(100*x, 3) for x in non_eic_pre_SE)))
print("post:  " + str(tuple(round(100*x, 3) for x in non_eic_post_SE)))
print("corr:  " + str(tuple(round(100*x, 3) for x in non_eic_corr_SE)))
print("non_corr:  " + str(tuple(round(100*x, 3) for x in non_eic_ncorr_SE)))

print("Mean Audit Rates:")
print("any: " + str(round(100*non_eic_overall_ar, 3)))
print("pre: " + str(round(100*non_eic_pre_ar, 3)))
print("post: " + str(round(100*non_eic_post_ar, 3)))
print("corr: " + str(round(100*non_eic_corr_ar, 3)))
print("non_corr: " + str(round(100*non_eic_ncorr_ar, 3)))
print(len(eitc_pop))
print(len(non_eitc_pop))
print(len(dataBISG))




sys.stdout = sys.__stdout__

